library(magrittr)
library(tinytex)
library(ALSM)
## Loading required package: leaps
## Loading required package: SuppDists
## Loading required package: car
## Loading required package: carData
library(stringr)
library(stats)

#plotly Regression packages
library(reshape2) # to load tips data
library(tidymodels) # for the fit() function
## Registered S3 method overwritten by 'tune':
##   method                   from   
##   required_pkgs.model_spec parsnip
## ── Attaching packages ────────────────────────────────────── tidymodels 0.1.4 ──
## ✓ broom        0.7.9      ✓ recipes      0.1.17
## ✓ dials        0.0.10     ✓ rsample      0.1.1 
## ✓ dplyr        1.0.7      ✓ tibble       3.1.6 
## ✓ ggplot2      3.3.5      ✓ tidyr        1.1.4 
## ✓ infer        1.0.0      ✓ tune         0.1.6 
## ✓ modeldata    0.1.1      ✓ workflows    0.2.4 
## ✓ parsnip      0.1.7      ✓ workflowsets 0.1.0 
## ✓ purrr        0.3.4      ✓ yardstick    0.0.9
## ── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
## x purrr::discard()   masks scales::discard()
## x tidyr::extract()   masks magrittr::extract()
## x dplyr::filter()    masks stats::filter()
## x recipes::fixed()   masks stringr::fixed()
## x dplyr::lag()       masks stats::lag()
## x dplyr::recode()    masks car::recode()
## x purrr::set_names() masks magrittr::set_names()
## x purrr::some()      masks car::some()
## x recipes::step()    masks stats::step()
## • Use tidymodels_prefer() to resolve common conflicts.
library(plotly)
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
data(tips)

CA demographics dataset can be downloaded from: https://www.countyhealthrankings.org/app/california/2021/downloads

#main CA demographics dataset
demo <- read.csv(file="demographics.csv")

 #main working df
df.demo <- demo[2:58,c(3,4,7,8,9,10,11,14,17,18,19,20,21,22,23)]
df.demo[,c(2,4,6,8,12)] <- df.demo[,c(2,4,6,8,12)] / 100 #change % columns to decimal

#main covid case data from jhu.edu
covid <- read.csv(file="CA-2020-cov-data.csv") %>% dplyr::rename(Cases = Case.Count.2020)

df.main <- left_join(df.demo, covid, by="County") %>% dplyr::select(Cases,County,-X,everything())
df.main <- df.main[,1:16]
plot(df.main)

Experiment 1: Full MLR

Y <- df.main$Cases -> Confirmed.COVID19.Cases
X1 <- df.main$NUM.FOOD.INSECURE -> Food.Insecurity
X2 <- df.main$PERCENT.ADULT.DIABETES -> Percent.Adult.Diabetes
X3 <- df.main$PERCENT.OBESITY -> Percent.Adult.Obesity
X4 <- df.main$PERCENT.LIMITED.ACCESS -> Access.to.Healthy.Food
X5 <- df.main$MEDIAN.HOUSEHOLD.INCOME -> Median.Household.Income
X6 <- df.main$AVG.DAILY.PM2.5 -> Air.Pollution

df.full <- cbind(Y,X1,X2,X3,X4,X5,X6) %>% as.data.frame

mod.full <- lm(Confirmed.COVID19.Cases~Food.Insecurity+Percent.Adult.Diabetes+Percent.Adult.Obesity+Access.to.Healthy.Food+Median.Household.Income+Air.Pollution, df.full)

summary(mod.full)
## 
## Call:
## lm(formula = Confirmed.COVID19.Cases ~ Food.Insecurity + Percent.Adult.Diabetes + 
##     Percent.Adult.Obesity + Access.to.Healthy.Food + Median.Household.Income + 
##     Air.Pollution, data = df.full)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -5750668  -480655   243852   590083  3111269 
## 
## Coefficients:
##                           Estimate Std. Error t value Pr(>|t|)    
## (Intercept)             -1.816e+06  2.544e+06  -0.714    0.479    
## Food.Insecurity          5.079e+01  1.468e+00  34.600   <2e-16 ***
## Percent.Adult.Diabetes  -5.907e+05  8.855e+06  -0.067    0.947    
## Percent.Adult.Obesity    4.058e+06  6.351e+06   0.639    0.526    
## Access.to.Healthy.Food   6.625e+06  5.915e+06   1.120    0.268    
## Median.Household.Income -2.057e+01  1.611e+01  -1.277    0.207    
## Air.Pollution            1.010e+05  8.448e+04   1.196    0.237    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1557000 on 50 degrees of freedom
## Multiple R-squared:  0.9689, Adjusted R-squared:  0.9652 
## F-statistic: 259.6 on 6 and 50 DF,  p-value: < 2.2e-16
anova(mod.full)
## Analysis of Variance Table
## 
## Response: Confirmed.COVID19.Cases
##                         Df     Sum Sq    Mean Sq   F value    Pr(>F)    
## Food.Insecurity          1 3.7369e+15 3.7369e+15 1541.3574 < 2.2e-16 ***
## Percent.Adult.Diabetes   1 4.6145e+12 4.6145e+12    1.9033  0.173844    
## Percent.Adult.Obesity    1 1.8402e+13 1.8402e+13    7.5902  0.008165 ** 
## Access.to.Healthy.Food   1 8.1325e+12 8.1325e+12    3.3544  0.072985 .  
## Median.Household.Income  1 4.1984e+12 4.1984e+12    1.7317  0.194191    
## Air.Pollution            1 3.4665e+12 3.4665e+12    1.4298  0.237433    
## Residuals               50 1.2122e+14 2.4244e+12                        
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Anova
## function (mod, ...) 
## {
##     UseMethod("Anova", mod)
## }
## <bytecode: 0x7fb1ceb8d080>
## <environment: namespace:car>

From the ANOVA results of the full MLR, it is clear further model refinement is required. Therefore, we plot the MLR model criteria with plotmodel.s function, part of the ALSM package. Additionally, the Best Subsets Algorithm (also from ALSM) is utilized to determine the best model for this experiment.

#Plot Criterias for Model Selection
plotmodel.s(df.full[,2:7],df.full$Y)

#Best Subsets Algorithm for Model Selection
bs.full <- BestSub(df.full[2:7],df.full$Y,num=1) %>% as.data.frame
bs.full
##   p 1 2 3 4 5 6         SSEp        r2    r2.adj        Cp     AICp     SBCp
## 1 2 1 0 0 0 0 0 1.600337e+14 0.9589331 0.9581865 13.009519 1637.811 1641.897
## 2 3 1 0 0 0 1 0 1.289219e+14 0.9669168 0.9656915  2.176782 1627.489 1633.618
## 3 4 1 0 0 0 1 1 1.251742e+14 0.9678786 0.9660604  2.630948 1627.808 1635.980
## 4 5 1 0 0 1 1 1 1.224698e+14 0.9685725 0.9661550  3.515448 1628.563 1638.778
## 5 6 1 0 1 1 1 1 1.212309e+14 0.9688905 0.9658405  5.004449 1629.983 1642.241
## 6 7 1 1 1 1 1 1 1.212201e+14 0.9688932 0.9651604  7.000000 1631.978 1646.279
##         PRESSp
## 1 4.359156e+14
## 2 3.441509e+14
## 3 4.293698e+14
## 4 4.277007e+14
## 5 4.270869e+14
## 6 4.295439e+14
#Which r^2 and r^2 adjusted are greatest?
r2.max <- bs.full$r2 %>% max
nrow(bs.full==r2.max)
## [1] 6
r2adj.max <- bs.full$r2.adj %>% max
nrow(bs.full==r2adj.max)
## [1] 6
Cp.min <- bs.full$Cp %>% min
nrow(bs.full==Cp.min)
## [1] 6
AIC.min <- bs.full$AICp %>% min
nrow(bs.full==AIC.min)
## [1] 6
SBC.min <- bs.full$SBCp %>% min
nrow(bs.full==SBC.min)
## [1] 6
PRESSp.min <- bs.full$PRESSp %>% min
nrow(bs.full==PRESSp.min)
## [1] 6

Experiment 2: Food insecurity as a factor for COVID-19 cases H0: b1 = 0 Ha: b1 ~= 0

Y <- df.main$Cases
X <- df.main$NUM.FOOD.INSECURE

df.fi <- cbind(X,Y) %>% as.data.frame

mod.fi <- lm(Y~X, df.fi)

summary(mod.fi)
## 
## Call:
## lm(formula = Y ~ X, data = df.fi)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -6229243  -314569   401278   644393  3827475 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -6.690e+05  2.494e+05  -2.683  0.00962 ** 
## X            5.004e+01  1.396e+00  35.837  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1706000 on 55 degrees of freedom
## Multiple R-squared:  0.9589, Adjusted R-squared:  0.9582 
## F-statistic:  1284 on 1 and 55 DF,  p-value: < 2.2e-16
anova(mod.fi)
## Analysis of Variance Table
## 
## Response: Y
##           Df     Sum Sq    Mean Sq F value    Pr(>F)    
## X          1 3.7369e+15 3.7369e+15  1284.3 < 2.2e-16 ***
## Residuals 55 1.6003e+14 2.9097e+12                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#plot linear model data
plot(mod.fi)

#plot data with regression line
plot(X,Y,pch = 16, cex = 1.3, col = "blue", main = "Confirmed COVID-19 Cases vs. Food Insecurity", xlab = "Food Insecurity", ylab = "Confirmed COVID-19 Cases")
abline(mod.fi)

#Plotly
fig.fi <- plot_ly(df.fi, x = ~X, y = ~Y, 
               type = 'scatter',
               alpha=0.65,
               mode='markers',
               name='COVID-19 Cases') %>% 
  add_lines(x=~X,y=fitted(mod.fi),name='Regression Line') %>%
  layout(title='COVID-19 Cases vs. Food Insecurity',
         xaxis = list(title='Number of Food Insecure Adults'),
         yaxis = list(title='Confirmed COVID-19 Cases'))
fig.fi

Experiment 3: Comparing Food Insecurity vs. Health factors such as obesity and diabetes. H0: b1 = 0, b2 = 0, b3 = 0 Ha: not H0

Y <- df.main$Cases -> Confirmed.COVID19.Cases
X1 <- df.main$NUM.FOOD.INSECURE -> Food.Insecurity
X2 <- df.main$PERCENT.ADULT.DIABETES -> Percent.Adult.Diabetes
X3 <- df.main$PERCENT.OBESITY -> Percent.Adult.Obesity
X4 <- df.main$PERCENT.LIMITED.ACCESS -> Access.to.Healthy.Food
X5 <- df.main$MEDIAN.HOUSEHOLD.INCOME -> Median.Household.Income
X6 -> df.main$AVG.DAILY.PM2.5 -> Air.Pollution

df.fih <- cbind(Y,X1,X2,X3,X4,X5,X6) %>% as.data.frame

mod.fih <- lm(Confirmed.COVID19.Cases~Food.Insecurity+Percent.Adult.Diabetes+Percent.Adult.Obesity, df.fih)

summary(mod.fih)
## 
## Call:
## lm(formula = Confirmed.COVID19.Cases ~ Food.Insecurity + Percent.Adult.Diabetes + 
##     Percent.Adult.Obesity, data = df.fih)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -5765097  -386068   231993   793507  3300537 
## 
## Coefficients:
##                          Estimate Std. Error t value Pr(>|t|)    
## (Intercept)            -3.895e+06  1.128e+06  -3.452   0.0011 ** 
## Food.Insecurity         5.112e+01  1.367e+00  37.402   <2e-16 ***
## Percent.Adult.Diabetes -4.952e+06  8.724e+06  -0.568   0.5727    
## Percent.Adult.Obesity   1.325e+07  4.965e+06   2.668   0.0101 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1608000 on 53 degrees of freedom
## Multiple R-squared:  0.9648, Adjusted R-squared:  0.9628 
## F-statistic: 484.8 on 3 and 53 DF,  p-value: < 2.2e-16
anova(mod.fih)
## Analysis of Variance Table
## 
## Response: Confirmed.COVID19.Cases
##                        Df     Sum Sq    Mean Sq   F value Pr(>F)    
## Food.Insecurity         1 3.7369e+15 3.7369e+15 1445.4658 <2e-16 ***
## Percent.Adult.Diabetes  1 4.6145e+12 4.6145e+12    1.7849 0.1873    
## Percent.Adult.Obesity   1 1.8402e+13 1.8402e+13    7.1180 0.0101 *  
## Residuals              53 1.3702e+14 2.5852e+12                     
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
plot(mod.fih)

#Plotly
fig.fih <- plot_ly(df.fih, x = ~X1+X2+X3, y = ~Y, 
               type='scatter', 
               alpha=0.65, 
               mode='markers',
               name='COVID-19 Cases') %>%
  add_lines(x=~X1+X2+X3,y=fitted(mod.fih),name='Regression Line') %>%
  layout(title='COVID-19 Cases vs. Food Insecurity+Obesity+Diabetes',
         xaxis = list(title='Food Insecurity + Obesity + Diabetes'),
         yaxis = list(title='Confirmed COVID-19 Cases'))
fig.fih

Experiment 3: Unemployment as a factor for COVID-19 cases H0: b1 = 0 Ha: b1 ~= 0

Y <- df.main$Cases
X <- df.main$NUM.UNEMPLOYED

df <- cbind(X,Y) %>% as.data.frame

mod.unemp <- lm(Y~X, df)

summary(mod.unemp)
## 
## Call:
## lm(formula = Y ~ X, data = df)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2273821  -301636   188771   371340  3418308 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -4.552e+05  1.285e+05  -3.542 0.000818 ***
## X            2.499e+02  3.584e+00  69.730  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 890200 on 55 degrees of freedom
## Multiple R-squared:  0.9888, Adjusted R-squared:  0.9886 
## F-statistic:  4862 on 1 and 55 DF,  p-value: < 2.2e-16
anova(mod.unemp)
## Analysis of Variance Table
## 
## Response: Y
##           Df     Sum Sq    Mean Sq F value    Pr(>F)    
## X          1 3.8533e+15 3.8533e+15  4862.2 < 2.2e-16 ***
## Residuals 55 4.3588e+13 7.9250e+11                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#plot linear model data
plot(mod.unemp)

#Plotly
fig.unemp <- plot_ly(df, x = ~X, y = ~Y, 
               type='scatter', 
               alpha=0.65, 
               mode='markers',
               name='COVID-19 Cases') %>%
  add_lines(x=~X,y=fitted(mod.unemp),name='Regression Line') %>%
  layout(title='COVID-19 Cases vs. Unemployment',
         xaxis = list(title='Number of Unemployed Adults'),
         yaxis = list(title='Confirmed COVID-19 Cases'))
fig.unemp